https://rstudio.com/products/rstudio/download/
# Install the packages used in this course with a single call.
install.packages(c("dplyr", "readr", "tidyr", "ggplot2"))
the vectors :
## [1] "numeric"
## [1] "character"
vectors, basic manipulations :
## [1] 3
## [1] 1 5
## [1] 1 5
## [1] "g" "g"
vectors, basic manipulations :
## [1] "a" "t" "g"
## [1] "a"
## [1] "a" "g" "a" "g" "a" "g" "a" "g" "a" "g"
## [1] "a" "a" "a" "g" "g" "g"
vectors, basic manipulations:
## [1] "a" "t" "g" "c"
## [1] 1 3 2 4 3
## attr(,"levels")
## [1] "a" "t" "g" "c"
# Numeric (double) 2 x 2 matrix, filled column by column
a <- matrix(c(1, 5, 10, 10), nrow = 2, ncol = 2)
# Character matrices built from the same three vectors:
# rbind() stacks them as rows (3 x 2), cbind() binds them as columns (2 x 3)
b <- rbind(c("a", "g"), c("t", "t"), c("c", "g"))
# NOTE: the variable `c` shares its name with base::c(); calls such as
# c(1, 2) still resolve to the function.
c <- cbind(c("a", "g"), c("t", "t"), c("c", "g"))
## [1] 3 2
## [,1] [,2] [,3]
## [1,] "a" "t" "c"
## [2,] "g" "t" "g"
## [1] 2 3
## [1] 1 10
## [1] "g" "t" "g"
## [1] "a" "t" "c"
## [1] 3
## [,1] [,2]
## [1,] "a" "g"
## [2,] "t" "t"
## [3,] "c" "g"
## [,1] [,2] [,3]
## [1,] "a" "t" "c"
## [2,] "g" "t" "g"
## [,1] [,2]
## [1,] "a" "g"
## [2,] "t" "t"
## [3,] "c" "g"
# Build a data frame with a constant character column, an integer
# sequence and uniform random numbers
d <- data.frame(v1 = rep("a", 10), v2 = 1:10, v3 = runif(10))
dim(d)
# Extract a single column as a vector
d$v1
# Add a factor column alternating between the two declared levels
d$v4 <- factor(rep(c("a", "b"), 5), levels = c("a", "b"))
# Keep only the rows where v4 equals "a"
d[d$v4 == "a", ]
# Select columns by name or by position
d[, "v2"]
d[, c(3, 1)]
d[, c("v2", "v4")]
names(d)
summary(d)

# f: subtract b from a, returning the result with an explicit return()
f <- function(a, b) {
  return(a - b)
}
f(5, 6)          # positional matching: a = 5, b = 6
f(b = 5, a = 6)  # named matching: argument order does not matter
# Same subtraction, now with default argument values; the value of the
# last evaluated expression is returned implicitly.
f <- function(a = 32, b = 12) a - b
f()
f(5, 6)
f(b = 5, a = 6)
# Base R readers ("filename" is a placeholder for the path of the file)
data <- read.table("filename")
data <- read.csv("filename")
# More performant readers from the readr package
library(readr)
data <- read_csv("filename")
data <- read_delim("filename")
Warning: avoid for loops (use vectorized operations instead)
## Time difference of 0.01478052 secs
Warning: avoid for loops (use vectorized operations instead)
Vectorized version
## Time difference of 0.002064466 secs
## [1] 7.159487
sum, cumulative sum (cumsum), finite differences (diff), max, min, …
sum, cumulative sum (cumsum), finite differences (diff), max, min, …
Apply a function to each element of an object
a <- data.frame(
  v1 = runif(5000),
  v2 = rnorm(5000),
  v3 = rbinom(5000, 5, 0.2)
)
# Apply to each row (MARGIN = 1): one sum per row
r <- apply(a, 1, sum)
head(r); class(r); dim(r)
# Apply to each column (MARGIN = 2): maximum and its position, per column
r <- apply(a, 2, function(col) {
  c(max(col), which.max(col))
})
r; class(r); dim(r)
# Apply to all elements of a list: the result is a list
b <- list(v1 = runif(5000), v2 = rnorm(5000), v3 = rbinom(5000, 5, 0.2))
r <- lapply(b, which.max)
r; class(r)
# Simplification of the result: sapply() collapses the list to a vector
r <- sapply(b, which.max)
r; class(r)
# better than loops…
Select a part of the data
## v1 v2 v3
## 806 0.9963570 -0.93142049 3
## 2301 0.9959883 -0.05808224 3
## 4597 0.9905205 -0.80389022 3
## v1 v2 v3
## 806 0.9963570 -0.93142049 3
## 2301 0.9959883 -0.05808224 3
## 4597 0.9905205 -0.80389022 3
Preprocess variables to build factors / intervals
## [1] "factor"
## [1] (-2,1] (-2,1] (-2,1] (1,2] (-2,1] (-2,1]
## Levels: (-Inf,-3] (-3,-2] (-2,1] (1,2] (2, Inf]
## [1] 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [26] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## [1] 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [20] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
## [39] 88 89 90 91 92 93 94 95 96 97 98 99 100
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1 2 3 4 5 6 7 8 9 10
## [51] 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## [76] 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
##
## a c g t
## 125 125 125 125
## v2
## v1 0 1 2 3 4 5 6 7 8 9
## a 0 1 17 31 27 28 14 6 1 0
## c 3 6 16 21 33 22 17 4 3 0
## g 0 8 21 26 31 23 12 3 1 0
## t 2 2 13 27 23 31 19 7 0 1
Two libraries for easy data manipulation (see the cheat sheet). Warning: they introduce a new operator!
Sequence of operations, introduction of the pipe operator :
x %>% f(y) is equivalent to f(x, y)
x %>% f(y) %>% g(z) is equivalent to g(f(x, y), z)
Easy to write, Easy to read
# NOTE(review): `data`, `condition`, `group` and the v* columns look like
# placeholders to be replaced by real objects — confirm before running.
# Row operations: filter, de-duplicate, sample and slice
data %>% filter(condition)
data %>% distinct(v1)
# NOTE(review): sample_n(), sample_frac() and top_n() are superseded in
# dplyr >= 1.0.0 by slice_sample() / slice_max() — consider updating.
data %>% sample_n(15,replace=FALSE)
data %>% sample_frac(0.2)
data %>% top_n(5,v1)
data %>% slice(20:30)
# Column operations: keep, drop or extract columns
data %>% select(v1,v2)
data %>% select(contains('var'))
data %>% select(-v3)
data %>% pull(v3)
# Create, rename and sort
data %>% mutate(v3=v1/v2)
data %>% rename(v4=v1)
data %>% arrange(v4)
data %>% arrange(desc(v4))
# Aggregation, over the whole table or per group
data %>% summarize(v1m=mean(v1))
data %>% group_by(group) %>% summarise(v1m=mean(v1))
data %>% group_by(group) %>% summarise(v1med=median(v1))
Aggregation functions: mean, median, n, sum, max, min, … Shortcut for counting rows per group:
# Both lines count the rows for each value of v4; count() is the shortcut
data %>% group_by(v4) %>% summarize(n=n())
data %>% count(v4)
# Cumulative, conditional and shifted values computed inside mutate()
data1 %>% mutate(v2=cumsum(v1))
data1 %>% mutate(v2=if_else(v1==32, "a", "b"))
# The fallback branch is a string like the other branches: unless a column
# or variable named `c` exists, a bare `c` would refer to the base function.
data1 %>% mutate(v2=case_when(v1==32 ~ "a",v1==33 & v4<5 ~"b", TRUE ~ "c"))
data1 %>% mutate(v2=lag(v1))
data1 %>% mutate(v2=lead(v4))
Note: use these after a group_by() to mutate within each group.
# Joins between data1 and data2
# Explicit key mapping: column v1 of data1 is matched against v2 of data2
data1 %>% left_join(data2, by=c("v1"="v2"))
# No `by` given: dplyr's default is to join on the columns the two tables
# have in common
data1 %>% right_join(data2)
data1 %>% inner_join(data2)
data1 %>% full_join(data2)
library(dplyr)
library(tidyr)
# One row per year x country combination
df <- expand_grid(
  year = 2015:2020,
  countries = c("France", "Italy", "Morocco")
)
# Attach one uniform random value to each row
df$value <- runif(nrow(df))
df[1:3, ]
## # A tibble: 3 × 3
## year countries value
## <int> <chr> <dbl>
## 1 2015 France 0.746
## 2 2015 Italy 0.542
## 3 2015 Morocco 0.751
## # A tibble: 3 × 7
## countries `2015` `2016` `2017` `2018` `2019` `2020`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 France 0.746 0.436 0.574 0.775 0.419 0.984
## 2 Italy 0.542 0.0340 0.786 0.933 0.263 0.813
## 3 Morocco 0.751 0.106 0.405 0.686 0.819 0.349
## # A tibble: 18 × 3
## countries year value
## <chr> <chr> <dbl>
## 1 France 2015 0.746
## 2 France 2016 0.436
## 3 France 2017 0.574
## 4 France 2018 0.775
## 5 France 2019 0.419
## 6 France 2020 0.984
## 7 Italy 2015 0.542
## 8 Italy 2016 0.0340
## 9 Italy 2017 0.786
## 10 Italy 2018 0.933
## 11 Italy 2019 0.263
## 12 Italy 2020 0.813
## 13 Morocco 2015 0.751
## 14 Morocco 2016 0.106
## 15 Morocco 2017 0.405
## 16 Morocco 2018 0.686
## 17 Morocco 2019 0.819
## 18 Morocco 2020 0.349
Exercise 1, 2 and 3 of GoT: https://comeetie.github.io/got/got_tp.html
Make a map showing the male first names most frequently given to children born in 2005 in each French department. The data to use are available in the data directory: